Fragestellung: “Die Mannschaft, die zur Halbzeit vorne liegt, gewinnt mit einer Chance von mindestens 75% auch das Spiel. Falls es zur Halbzeit unentschieden steht, gewinnt eher das Heimteam.”

Dafür nehmen wir den Datacamp Datensatz Soccer Data

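Als Einführung werden wir auf Datacamp folgende Kurse durchgehen:

- Interactive Data Visualization with plotly
- Intermediate Interactive Data Visualization with plotly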

# Bibliotheken importieren
library("plotly")
library("plyr")
library("dplyr")
library("forcats")
library("RColorBrewer")

Daten einlesen und Dataframe erstellen

# List the CSV files in folder "Data". Restricting the listing to *.csv keeps
# stray files (README, .DS_Store, ...) from being handed to read.csv().
files <- list.files(
  path = "./Data/",
  pattern = "\\.csv$",
  all.files = FALSE,
  full.names = TRUE,
  ignore.case = TRUE
)

# Fail early with a clear message instead of building an empty data frame
# that only breaks later, in the first count().
if (length(files) == 0) {
  stop("No CSV files found in ./Data/", call. = FALSE)
}

# Create one data frame from all CSV files (years 2015-2019 per the original
# note): ldply() applies read.csv to each path and row-binds the results.
df <- ldply(.data = files, .fun = read.csv)

# View() needs an interactive data viewer; skip it when the notebook is
# rendered non-interactively.
if (interactive()) {
  View(df)
}
# Tally how often each halftime result (HTR) and each fulltime result (FTR)
# occurs; both tables carry the frequency in column `n`.
df_htr <- count(df, HTR)
df_ftr <- count(df, FTR)
# Create data frame with halftime & fulltime result amounts.
# Counts are looked up by result code (A/D/H) rather than by row position, so
# an extra category in the count tables (e.g. NA or "" from blank CSV rows) or
# a missing one cannot misalign the labels; a missing code yields NA.
result_labels <- c(A = "Away", D = "Draw", H = "Home")
result_codes <- names(result_labels)
df_results <- data.frame(
  unname(result_labels),
  df_htr$n[match(result_codes, df_htr$HTR)],
  df_ftr$n[match(result_codes, df_ftr$FTR)]
)

# Rename column headers
col_headings <- c("Result", "Halftime", "Fulltime")
names(df_results) <- col_headings

# Plot grouped bar chart to visualize halftime & fulltime results.
# The x mapping uses the column `Result` of df_results; the previous
# `~Results` referred to a column that does not exist.
fig <- plot_ly(
  df_results,
  x = ~Result,
  y = ~Halftime,
  type = "bar",
  name = "Half Time Score"
) %>%
  add_trace(y = ~Fulltime, name = "Full Time Score") %>%
  layout(yaxis = list(title = "Count"), barmode = "group")

fig
# Combine halftime and fulltime result into a single "game progress" column,
# e.g. "H D" (codes separated by a space).
df$result <- with(df, paste(HTR, FTR))

print("Example: H H = home team is winning at halftime and also wins the game at fulltime")
[1] "Example: H H = home team is winning at halftime and also wins the game at fulltime"
# Frequency of every halftime/fulltime combination, with the factor levels
# ordered from most to least frequent so the bars appear in descending order.
result_counts <- df %>%
  count(result) %>%
  mutate(result = fct_reorder(result, n, .desc = TRUE))

# Bar chart of the counts, each bar labelled with its value.
plot_ly(
  result_counts,
  x = ~result,
  y = ~n,
  text = ~n,
  textposition = 'auto'
) %>%
  add_bars() %>%
  layout(
    xaxis = list(title = "Game Progress"),
    yaxis = list(title = "Amount"),
    title = "How are the different game progresses distributed?"
  )
# Share of each game progress among all matches, in percent (2 decimals).
df_count_results <- summarise(
  group_by(df, result),
  count_result = round(n() / nrow(df) * 100, digits = 2)
)

# Donut chart of the percentages per game progress.
plot_ly(df_count_results, labels = ~result, values = ~count_result) %>%
  add_pie(hole = 0.4, color = I("white")) %>%
  layout(
    xaxis = list(title = "Game Progress"),
    yaxis = list(title = "Probability %"),
    title = "What is the probability of each game progress?"
  )
#' Percentage of rows in `df2` relative to rows in `df1`
#'
#' @param df1 Data frame whose row count is the reference (denominator).
#' @param df2 Data frame whose row count is the numerator.
#' @return `100 / nrow(df1) * nrow(df2)`, rounded to two decimals.
calc_prob <- function(df1, df2) {
  n_reference <- nrow(df1)
  n_subset <- nrow(df2)
  round(100 / n_reference * n_subset, digits = 2)
}
# Matches in which the home team was ahead at halftime
df_ht_home <- filter(df, HTR == "H")

# Of those, the matches the home team also won at fulltime
df_ft_home <- filter(df_ht_home, FTR == "H")

# Percentage of halftime home leads that ended as home wins
home_win_prob <- calc_prob(df_ht_home, df_ft_home)

cat("Probability that the home team wins the game if they are leading at half time: ", home_win_prob, "%")
Probability that the home team wins the game if they are leading at half time:  82.55 %
# Matches in which the away team was ahead at halftime
df_ht_away <- filter(df, HTR == "A")

# Of those, the matches the away team also won at fulltime
df_ft_away <- filter(df_ht_away, FTR == "A")

# Percentage of halftime away leads that ended as away wins
away_win_prob <- calc_prob(df_ht_away, df_ft_away)

cat("Probability that the away team wins the game if they are leading at half time: ", away_win_prob, "%")
Probability that the away team wins the game if they are leading at half time:  72.03 %
# Matches that were level at halftime
df_ht_draw <- filter(df, HTR == "D")

# Of those, the matches that were still level at fulltime
df_ft_draw <- filter(df_ht_draw, FTR == "D")

# Percentage of halftime draws that also ended as draws
draw_prob <- calc_prob(df_ht_draw, df_ft_draw)

cat("Probability that the game ends in a draw if the halftime result is also a draw: ", draw_prob, "%")
Probability that the game ends in a draw if the halftime result is also a draw:  36.45 %
# Matches level at halftime that the home team won at fulltime
df_ht_draw_ft_home_win <- filter(df_ht_draw, FTR == "H")

# Percentage of halftime draws that ended as home wins
home_win_after_ht_draw_prob <- calc_prob(df_ht_draw, df_ht_draw_ft_home_win)

cat("Probability that the home team wins if the halftime result is a draw: ", home_win_after_ht_draw_prob, "%")
Probability that the home team wins if the halftime result is a draw:  38.03 %
# Probability that the team leading at half time wins the game.
# Pool home and away sides: (wins after a halftime lead) / (all halftime leads).
# The earlier formula averaged the two per-side percentages weighted by the
# number of wins instead of the number of halftime leads, which overweights the
# side with the higher conversion rate, and it reused already-rounded values.
n_ht_leads <- nrow(df_ht_home) + nrow(df_ht_away)
n_lead_wins <- nrow(df_ft_home) + nrow(df_ft_away)
ht_ft_win_prob <- round(100 * n_lead_wins / n_ht_leads, digits = 2)

cat("Probability that the team leading at half time wins the entire game: ", ht_ft_win_prob, "%")
Probability that the team leading at half time wins the entire game:  78.41 %
LS0tCnRpdGxlOiAiRGF0YXZpeiBtaXQgUGxvdGx5IFBMIERhdGEiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMjIEZyYWdlc3RlbGx1bmc6ICJEaWUgTWFuc2NoYWZmdCwgZGllIHp1ciBIYWxiemVpdCB2b3JuZSBsaWVndCwgZ2V3aW5udCBtaXQgZWluZXIgQ2hhbmNlIHZvbiBtaW5kZXN0ZW5zIDc1JSBhdWNoIGRhcyBTcGllbC4gRmFsbHMgenVyIEhhbGJ6ZWl0IHVuZW50c2NoaWVkZW4gaXN0LCBnZXdpbm50IGVoZXIgZGFzIEhlaW10ZWFtLiIKCgpEYWbDvHIgbmVobWVuIHdpciBkZW4gRGF0YWNhbXAgRGF0ZW5zYXR6IFtTb2NjZXIgRGF0YV0oaHR0cHM6Ly9hcHAuZGF0YWNhbXAuY29tL3dvcmtzcGFjZS9kYXRhc2V0cy9kYXRhc2V0LXB5dGhvbi1zb2NjZXIpCgpBbHMgRWluZsO8aHJ1bmcgd2VyZGVuIHdpciBhdWYgRGF0YWNhbXAgZm9sZ2VuZGUgS3Vyc2UgZHVyY2hnZWhlbjoKCi0gW0ludGVyYWN0aXZlIERhdGEgVmlzdWFsaXphdGlvbiB3aXRoIHBsb3RseV0oaHR0cHM6Ly9hcHAuZGF0YWNhbXAuY29tL2xlYXJuL2NvdXJzZXMvaW50ZXJhY3RpdmUtZGF0YS12aXN1YWxpemF0aW9uLXdpdGgtcGxvdGx5LWluLXIpCgotIFtJbnRlcm1lZGlhdGUgSW50ZXJhY3RpdmUgRGF0YSBWaXN1YWxpemF0aW9uIHdpdGggcGxvdGx5XShodHRwczovL2FwcC5kYXRhY2FtcC5jb20vbGVhcm4vY291cnNlcy9pbnRlcmFjdGl2ZS1kYXRhLXZpc3VhbGl6YXRpb24td2l0aC1wbG90bHktaW4tcikKCmBgYHtyfQojIEJpYmxpb3RoZWtlbiBpbXBvcnRpZXJlbgpsaWJyYXJ5KCJwbG90bHkiKQpsaWJyYXJ5KCJwbHlyIikKbGlicmFyeSgiZHBseXIiKQpsaWJyYXJ5KCJmb3JjYXRzIikKbGlicmFyeSgiUkNvbG9yQnJld2VyIikKYGBgCiMjIERhdGVuIGVpbmxlc2VuIHVuZCBEYXRhZnJhbWUgZXJzdGVsbGVuCmBgYHtyfQojIExpc3QgZmlsZXMgaW4gZm9sZGVyICJEYXRhIgpmaWxlcyA8LSBsaXN0LmZpbGVzKHBhdGg9Ii4vRGF0YS8iLCBwYXR0ZXJuPU5VTEwsIGFsbC5maWxlcz1GQUxTRSwgZnVsbC5uYW1lcz1UUlVFKQoKIyBDcmVhdGUgRGF0YWZyYW1lIHdpdGggYWxsIGNzdiBmcm9tIHllYXJzIDIwMTUtMjAxOQpkZiA8LSBsZHBseSguZGF0YSA9IGZpbGVzLCAuZnVuID0gcmVhZC5jc3YpCgpWaWV3KGRmKQpgYGAKCmBgYHtyfQojIENvdW50IGZyZXF1ZW5jeSBvZiBoYWx0aW1lICYgZnVsbHRpbWUgcmVzdWx0cwpkZl9odHIgPC0gZGYgJT4lIGNvdW50KEhUUikKZGZfZnRyIDwtIGRmICU+JSBjb3VudChGVFIpCmBgYAoKYGBge3J9CiMgQ3JlYXRlIGRhdGFmcmFtZSB3aXRoIGhhbHR0aW1lICYgZnVsbHRpbWUgcmVzdWx0IGFtb3VudHMKZGZfcmVzdWx0cyA8LSBkYXRhLmZyYW1lKGMoIkF3YXkiLCAiRHJhdyIsICJIb21lIiksIGMoZGZfaHRyJG4pLCBjKGRmX2Z0ciRuKSkKCiMgUmVuYW1lIGNvbHVtbiBoZWFkZXJzCmNvbF9oZWFkaW5ncyA8LSBjKCdSZXN1bHQnLCdIYWxmdGltZScsJ0Z1bGx0aW1lJykKbmFtZXMoZGZfcmVzdWx0cykgPC0g
Y29sX2hlYWRpbmdzCgojIFBsb3QgZ3JvdXBlZCBiYXIgY2hhcnQgdG8gdmlzdWFsaXplIGhhbGZ0aW1lICYgZnVsbHRpbWUgcmVzdWx0cwpmaWcgPC0gcGxvdF9seSgKICBkZl9yZXN1bHRzLCB4ID0gflJlc3VsdHMsIHkgPSB+SGFsZnRpbWUsIHR5cGUgPSAnYmFyJywgbmFtZSA9ICdIYWxmIFRpbWUgU2NvcmUnKSAlPiUgCiAgYWRkX3RyYWNlKHkgPSB+RnVsbHRpbWUsIG5hbWUgPSAnRnVsbCBUaW1lIFNjb3JlJykgJT4lCiAgbGF5b3V0KHlheGlzID0gbGlzdCh0aXRsZSA9ICdDb3VudCcpLCBiYXJtb2RlID0gJ2dyb3VwJykKCmZpZwpgYGAKYGBge3J9CiMgbWVyZ2UgSFRSICYgRlRSIHRvIDEgY29sdW1uCmRmJHJlc3VsdCA8LSBwYXN0ZShkZiRIVFIsIGRmJEZUUikKCnByaW50KCJFeGFtcGxlOiBIIEggPSBob21lIHRlYW0gaXMgd2lubmluZyBhdCBoYWxmdGltZSBhbmQgYWxzbyB3aW5zIHRoZSBnYW1lIGF0IGZ1bGx0aW1lIikKYGBgCgpgYGB7cn0KIyBQbG90IGFsbCBkaWZmZXJlbnQgZ2FtZSBwcm9ncmVzc2VzIGFuZCB0aGVpciBhbW91bnQKZGYgJT4lCiAgY291bnQocmVzdWx0KSAlPiUKICBtdXRhdGUocmVzdWx0ID0gZmN0X3Jlb3JkZXIocmVzdWx0LCBuLCAuZGVzYyA9IFRSVUUpKSAlPiUKICBwbG90X2x5KHggPSB+cmVzdWx0LCB5ID0gfm4sIHRleHQgPSB+biwgdGV4dHBvc2l0aW9uID0gJ2F1dG8nKSAlPiUKICBhZGRfYmFycygpICU+JQogIGxheW91dCh4YXhpcyA9IGxpc3QodGl0bGUgPSAiR2FtZSBQcm9ncmVzcyIpLAogICAgICAgICB5YXhpcyA9IGxpc3QodGl0bGUgPSAiQW1vdW50IiksCiAgICAgICAgIHRpdGxlID0gIkhvdyBhcmUgdGhlIGRpZmZlcmVudCBnYW1lIHByb2dyZXNzZXMgZGlzdHJpYnV0ZWQ/IikKYGBgCmBgYHtyfQojIEdyb3VwIGJ5IGdhbWUgb3V0Y29tZSAmIGNhbGN1bGF0ZSBwcm9iYWJpbGl0eSBvZiBhbGwgb3V0Y29tZXMKZGZfY291bnRfcmVzdWx0cyA8LSBkZiAlPiUgCiAgZ3JvdXBfYnkocmVzdWx0KSAlPiUgCiAgc3VtbWFyaXNlKGNvdW50X3Jlc3VsdCA9IHJvdW5kKG4oKSAvIG5yb3coZGYpICogMTAwLCBkaWdpdHMgPSAyKSkKCmRmX2NvdW50X3Jlc3VsdHMgJT4lCiAgcGxvdF9seShsYWJlbHMgPSB+cmVzdWx0LCB2YWx1ZXMgPSB+Y291bnRfcmVzdWx0KSAlPiUKICBhZGRfcGllKGhvbGUgPSAwLjQsIGNvbG9yID0gSSgid2hpdGUiKSkgJT4lCiAgbGF5b3V0KHhheGlzID0gbGlzdCh0aXRsZSA9ICJHYW1lIFByb2dyZXNzIiksCiAgICAgICAgIHlheGlzID0gbGlzdCh0aXRsZSA9ICJQcm9iYWJpbGl0eSAlIiksCiAgICAgICAgIHRpdGxlID0gIldoYXQgaXMgdGhlIHByb2JhYmlsaXR5IG9mIGVhY2ggZ2FtZSBwcm9ncmVzcz8iKQpgYGAKCmBgYHtyfQojIENhbGN1bGF0ZSBwcm9iYWJpbGl0eSAKY2FsY19wcm9iIDwtIGZ1bmN0aW9uKGRmMSwgZGYyKSB7CiAgcHJvYiA8LSByb3VuZCgoMTAwIC8gbnJvdyhkZjEpICogbnJvdyhkZjIpKSwgZGlnaXRzID0gMikKICByZXR1cm4ocHJvYikK
fQpgYGAKCmBgYHtyfQojIEZpbHRlciBob21lIHRlYW1zIHdpbm5pbmcgYXQgaGFsZnRpbWUKZGZfaHRfaG9tZSA8LSBkZiAlPiUgCiAgZmlsdGVyKEhUUiA9PSAiSCIpCgojIEZpbHRlciBob21lIHRlYW1zIHdpbm5pbmcgYXQgaGFsZnRpbWUgJiBmdWxsdGltZQpkZl9mdF9ob21lIDwtIGRmX2h0X2hvbWUgJT4lIAogIGZpbHRlcihGVFIgPT0gIkgiKQoKaG9tZV93aW5fcHJvYiA8LSBjYWxjX3Byb2IoZGZfaHRfaG9tZSwgZGZfZnRfaG9tZSkKCmNhdCgiUHJvYmFiaWxpdHkgdGhhdCB0aGUgaG9tZSB0ZWFtIHdpbnMgdGhlIGdhbWUgaWYgdGhleSBhcmUgbGVhZGluZyBhdCBoYWxmIHRpbWU6ICIsIGhvbWVfd2luX3Byb2IsICIlIikKYGBgCgpgYGB7cn0KIyBGaWx0ZXIgYXdheSB0ZWFtcyB3aW5uaW5nIGF0IGhhbGZ0aW1lCmRmX2h0X2F3YXkgPC0gZGYgJT4lIAogIGZpbHRlcihIVFIgPT0gIkEiKQoKIyBGaWx0ZXIgYXdheSB0ZWFtcyB3aW5uaW5nIGF0IGhhbGZ0aW1lICYgZnVsbHRpbWUKZGZfZnRfYXdheSA8LSBkZl9odF9hd2F5ICU+JSAKICBmaWx0ZXIoRlRSID09ICJBIikKCmF3YXlfd2luX3Byb2IgPC0gY2FsY19wcm9iKGRmX2h0X2F3YXksIGRmX2Z0X2F3YXkpCgpjYXQoIlByb2JhYmlsaXR5IHRoYXQgdGhlIGF3YXkgdGVhbSB3aW5zIHRoZSBnYW1lIGlmIHRoZXkgYXJlIGxlYWRpbmcgYXQgaGFsZiB0aW1lOiAiLCBhd2F5X3dpbl9wcm9iLCAiJSIpCmBgYAoKYGBge3J9CiMgRmlsdGVyIGRyYXcgYXQgaGFsZnRpbWUKZGZfaHRfZHJhdyA8LSBkZiAlPiUgCiAgZmlsdGVyKEhUUiA9PSAiRCIpCgojIEZpbHRlciBkcmF3IGF0IGhhbGZ0aW1lICYgZnVsbHRpbWUKZGZfZnRfZHJhdyA8LSBkZl9odF9kcmF3ICU+JSAKICBmaWx0ZXIoRlRSID09ICJEIikKCmRyYXdfcHJvYiA8LSBjYWxjX3Byb2IoZGZfaHRfZHJhdywgZGZfZnRfZHJhdykKCmNhdCgiUHJvYmFiaWxpdHkgdGhhdCB0aGUgZ2FtZSBlbmRzIGluIGEgZHJhdyBpZiB0aGUgaGFsZnRpbWUgcmVzdWx0IGlzIGFsc28gYSBkcmF3OiAiLCBkcmF3X3Byb2IsICIlIikKYGBgCgpgYGB7cn0KIyBGaWx0ZXIgZHJhdyBhdCBoYWxmdGltZSAmIHRoZSBob21lIHRlYW0gd2lubmluZyBhdCBmdWxsdGltZQpkZl9odF9kcmF3X2Z0X2hvbWVfd2luIDwtIGRmX2h0X2RyYXcgJT4lCiAgZmlsdGVyKEZUUiA9PSAiSCIpCgpob21lX3dpbl9hZnRlcl9odF9kcmF3X3Byb2IgPC0gY2FsY19wcm9iKGRmX2h0X2RyYXcsIGRmX2h0X2RyYXdfZnRfaG9tZV93aW4pCgpjYXQoIlByb2JhYmlsaXR5IHRoYXQgdGhlIGhvbWUgdGVhbSB3aW5zIGlmIHRoZSBoYWxmdGltZSByZXN1bHQgaXMgYSBkcmF3OiAiLCBob21lX3dpbl9hZnRlcl9odF9kcmF3X3Byb2IsICIlIikKYGBgCgpgYGB7cn0KIyBQcm9iYWJpbGl0eSB0aGF0IHRoZSB0ZWFtIHdpbm5pbmcgYXQgaGFsZiB0aW1lIHdpbnMgdGhlIGdhbWUKaHRfZnRfd2luX3Byb2IgPC0gcm91bmQoKChob21lX3dpbl9wcm9iICogbnJvdyhkZl9mdF9ob21l
KSkgKyAoYXdheV93aW5fcHJvYiAqIG5yb3coZGZfZnRfYXdheSkpKSAvIChucm93KGRmX2Z0X2hvbWUpICsgbnJvdyhkZl9mdF9hd2F5KSksIGRpZ2l0cyA9IDIpCgpjYXQoIlByb2JhYmlsaXR5IHRoYXQgdGhlIHRlYW0gbGVhZGluZyBhdCBoYWxmIHRpbWUgd2lucyB0aGUgZW50aXJlIGdhbWU6ICIsIGh0X2Z0X3dpbl9wcm9iLCAiJSIpCmBgYAoKYGBge3J9CgpgYGAKCg==